R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#install.packages("fastDummies")
#install.packages("randomForest")
library(ggplot2)
library(reshape2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(dplyr)
library(lubridate)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
trends_byage <- read.csv("people/trends-byage.csv")
trends_byrace <- read.csv("people/trends-byrace.csv")
trends_byboro <- read.csv("people/trends-byboro.csv")
coverage_by_demo <- read.csv("people/coverage-by-demo.csv")
trends_byage$DATE <- as.Date(trends_byage$DATE)
trends_byage <- filter(trends_byage, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-08-31"))
trends_long <- melt(trends_byage, id.vars = "DATE", 
                    measure.vars = grep("COUNT_ADDITIONAL_CUMULATIVE", names(trends_byage), value = TRUE))
names(trends_long)[names(trends_long) == "variable"] <- "Age_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))

ggplot(trends_long, aes(x = DATE, y = Cumulative_Count, color = Age_Group)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") + 
  scale_y_log10() +  
  labs(title = "Trends of Additional/Booster Doses by Age Group (Aug 2021 - Aug 2022)",
       x = "Date",
       y = "Cumulative Count of Additional Doses (Log Scale)") +  
  theme_minimal() +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1))

age_group_columns <- c("COUNT_ADDITIONAL_CUMULATIVE_0_4", "COUNT_ADDITIONAL_CUMULATIVE_5_12", 
                       "COUNT_ADDITIONAL_CUMULATIVE_13_17", "COUNT_ADDITIONAL_CUMULATIVE_18_24", 
                       "COUNT_ADDITIONAL_CUMULATIVE_25_34", "COUNT_ADDITIONAL_CUMULATIVE_35_44", 
                       "COUNT_ADDITIONAL_CUMULATIVE_45_54", "COUNT_ADDITIONAL_CUMULATIVE_55_64", 
                       "COUNT_ADDITIONAL_CUMULATIVE_65_74", "COUNT_ADDITIONAL_CUMULATIVE_75up")
trends_byage[age_group_columns] <- lapply(trends_byage[age_group_columns], function(x) as.numeric(as.character(x)))
trends_byage[is.na(trends_byage)] <- 0
trends_byage$COUNT_ADDITIONAL_CUMULATIVE_0_17 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_0_4 +
                                                 trends_byage$COUNT_ADDITIONAL_CUMULATIVE_5_12 +
                                                 trends_byage$COUNT_ADDITIONAL_CUMULATIVE_13_17

trends_byage$COUNT_ADDITIONAL_CUMULATIVE_18_44 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_18_24 +
                                                  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_25_34 +
                                                  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_35_44

trends_byage$COUNT_ADDITIONAL_CUMULATIVE_45_64 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_45_54 +
                                                  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_55_64

trends_byage$COUNT_ADDITIONAL_CUMULATIVE_65_74 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_65_74

trends_byage$COUNT_ADDITIONAL_CUMULATIVE_75up <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_75up

trends_byage$DATE <- as.Date(trends_byage$DATE, format = "%Y-%m-%d")
age_groups <- c("COUNT_ADDITIONAL_CUMULATIVE_0_17", "COUNT_ADDITIONAL_CUMULATIVE_18_44", 
                "COUNT_ADDITIONAL_CUMULATIVE_45_64", "COUNT_ADDITIONAL_CUMULATIVE_65_74", 
                "COUNT_ADDITIONAL_CUMULATIVE_75up")
trends_byage <- filter(trends_byage, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-05-31"))
trends_long <- melt(trends_byage, id.vars = "DATE", 
                    measure.vars = age_groups)
names(trends_long)[names(trends_long) == "variable"] <- "Age_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))

ggplot(trends_long, aes(x = DATE, y = Cumulative_Count, color = Age_Group)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") + 
  scale_y_log10() +  
  labs(title = "Trends of Additional Doses by Age Group",
       x = "Date",
       y = "Cumulative Count of Additional Doses (Log Scale)") +
  theme_minimal() +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1))

trends_byrace$DATE <- as.Date(trends_byrace$DATE, format = "%Y-%m-%d")
trends_byrace <- filter(trends_byrace, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-05-31"))
race_groups <- c("COUNT_ADDITIONAL_CUMULATIVE_AIAN", "COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI",
                 "COUNT_ADDITIONAL_CUMULATIVE_BLACK", "COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO",
                 "COUNT_ADDITIONAL_CUMULATIVE_WHITE", "COUNT_ADDITIONAL_CUMULATIVE_OTHER")
trends_long <- melt(trends_byrace, id.vars = "DATE", 
                    measure.vars = race_groups)
names(trends_long)[names(trends_long) == "variable"] <- "Race_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))

ggplot(trends_long, aes(x = DATE, y = Cumulative_Count, color = Race_Group)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  scale_y_log10() +
  labs(title = "Trends of Additional Doses by Race Group from August 2021",
       x = "Date",
       y = "Cumulative Count of Additional Doses") +
  theme_minimal() +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1))

Data Preprocessing

age_group_columns <- c("COUNT_ADDITIONAL_CUMULATIVE_5_12", 
                       "COUNT_ADDITIONAL_CUMULATIVE_13_17", "COUNT_ADDITIONAL_CUMULATIVE_18_24", 
                       "COUNT_ADDITIONAL_CUMULATIVE_25_34", "COUNT_ADDITIONAL_CUMULATIVE_35_44", 
                       "COUNT_ADDITIONAL_CUMULATIVE_45_54", "COUNT_ADDITIONAL_CUMULATIVE_55_64", 
                       "COUNT_ADDITIONAL_CUMULATIVE_65_74", "COUNT_ADDITIONAL_CUMULATIVE_75up")
trends_byage$DATE <- as.Date(trends_byage$DATE)
trends_byage <- filter(trends_byage, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-08-31"))
trends_long <- melt(trends_byage, id.vars = "DATE", 
                    measure.vars = grep("COUNT_ADDITIONAL_CUMULATIVE", names(trends_byage), value = TRUE))
names(trends_long)[names(trends_long) == "variable"] <- "Age_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))
start_year <- 2020  
start_month <- 1
for(age_group in age_group_columns) {
  group_data <- filter(trends_long, Age_Group == age_group)
  ts_data <- ts(group_data$Cumulative_Count, start=c(start_year, start_month), frequency=12)
  model <- auto.arima(ts_data)
  residuals_data <- residuals(model)

  Acf(residuals_data, main = paste("ACF of Residuals for", age_group))
}

Age Forecasting Model

start_year = 2021
start_month = 8

forecasts = list()
errors = list()

latest_data_year = 2022
latest_data_month = 5

future_periods = (2025 - latest_data_year) * 12 + (12 - latest_data_month)

for(age_group in age_group_columns) {
  group_data <- filter(trends_long, Age_Group == age_group)
  ts_data <- ts(group_data$Cumulative_Count, start=c(start_year, start_month), frequency=12)
  error <- tsCV(ts_data, forecastfunction = function(x) forecast(auto.arima(x), h=1), h=1)
  errors[[age_group]] <- error
  model <- auto.arima(ts_data)
  forecast_data <- forecast(model, h=future_periods)
  forecasts[[age_group]] <- forecast_data
}

Age Forecasting Result

for(i in 1:length(forecasts)) {
  age_group <- names(forecasts)[i]
  forecast_data <- forecasts[[age_group]]

  historical_data <- filter(trends_long, Age_Group == age_group) %>%
                     select(Date = DATE, Count = Cumulative_Count) %>%
                     mutate(Type = "Historical")

  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1), 
                        length.out = length(forecast_data$mean), 
                        by = "month")
  
  forecast_df <- data.frame(Date = forecast_dates, 
                            Count = forecast_data$mean, 
                            Type = "Forecasted")

  combined_df <- rbind(historical_data, forecast_df)

  p <- ggplot(combined_df, aes(x = Date, y = Count, color = Type)) +
        geom_line() +
        labs(title = paste("Forecast for Age Group", gsub("COUNT_ADDITIONAL_CUMULATIVE_", "", age_group), "(2021 - 2025)"),
             x = "Date",
             y = "Cumulative Count") +
        theme_minimal() +
        theme(legend.title = element_blank())

  print(p)
}

 head(combined_df)
##         Date Count       Type
## 1 2021-08-01  7550 Historical
## 2 2021-08-02  7566 Historical
## 3 2021-08-03  7578 Historical
## 4 2021-08-04  7603 Historical
## 5 2021-08-05  7629 Historical
## 6 2021-08-06  7637 Historical
 tail(combined_df)
##           Date    Count       Type
## 342 2025-07-01 300252.4 Forecasted
## 343 2025-08-01 300350.9 Forecasted
## 344 2025-09-01 300440.4 Forecasted
## 345 2025-10-01 300538.4 Forecasted
## 346 2025-11-01 300644.6 Forecasted
## 347 2025-12-01 300745.6 Forecasted
 std_dev_combined_df <- sd(combined_df$Count)
print(std_dev_combined_df)
## [1] 111226.3

Age RMSE

rmse_results <- list()

for(i in 1:length(forecasts)) {
  age_group <- names(forecasts)[i]
  forecast_data <- forecasts[[age_group]]

  historical_data <- filter(trends_long, Age_Group == age_group) %>%
                     select(Date = DATE, Count = Cumulative_Count) %>%
                     mutate(Type = "Historical")

  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1), 
                        length.out = length(forecast_data$mean), 
                        by = "month")

  forecast_df <- data.frame(Date = forecast_dates, 
                            Count = forecast_data$mean, 
                            Type = "Forecasted")

  end_date_for_actuals <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) - 1
  actual_values <- historical_data$Count[historical_data$Date <= end_date_for_actuals]

  if (length(actual_values) >= length(forecast_data$mean)) {
    actual_values <- tail(actual_values, length(forecast_data$mean))
    forecasted_values <- forecast_data$mean

    rmse <- sqrt(mean((actual_values - forecasted_values)^2, na.rm = TRUE))  
    rmse_results[[age_group]] <- rmse
  } else {
    rmse_results[[age_group]] <- NA  
  }
}
rmse_results
## $COUNT_ADDITIONAL_CUMULATIVE_5_12
## [1] 15321.36
## 
## $COUNT_ADDITIONAL_CUMULATIVE_13_17
## [1] 11232.03
## 
## $COUNT_ADDITIONAL_CUMULATIVE_18_24
## [1] 12890.49
## 
## $COUNT_ADDITIONAL_CUMULATIVE_25_34
## [1] 20305.56
## 
## $COUNT_ADDITIONAL_CUMULATIVE_35_44
## [1] 15702.67
## 
## $COUNT_ADDITIONAL_CUMULATIVE_45_54
## [1] 16352.19
## 
## $COUNT_ADDITIONAL_CUMULATIVE_55_64
## [1] 18716.3
## 
## $COUNT_ADDITIONAL_CUMULATIVE_65_74
## [1] 13180.44
## 
## $COUNT_ADDITIONAL_CUMULATIVE_75up
## [1] 9971.597

Age Moving Rate

slopes <- list()

for(i in 1:length(forecasts)) {
  
  age_group <- names(forecasts)[i]
  
  forecast_data <- forecasts[[age_group]]
  
  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1), 
                        length.out = length(forecast_data$mean), 
                        by = "month")
  forecast_df <- data.frame(Date = forecast_dates, 
                            Count = forecast_data$mean)

  forecast_df$Time <- as.numeric(forecast_df$Date - min(forecast_df$Date))
  
  lm_model <- lm(Count ~ Time, data = forecast_df)

  slope <- coef(lm_model)["Time"]

  slopes[[age_group]] <- slope
}

slopes
## $COUNT_ADDITIONAL_CUMULATIVE_5_12
##     Time 
## 10.61337 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_13_17
##     Time 
## 4.739182 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_18_24
##     Time 
## 4.496523 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_25_34
##     Time 
## 7.108529 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_35_44
##    Time 
## 4.63574 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_45_54
##     Time 
## 4.275551 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_55_64
##     Time 
## 5.032212 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_65_74
##     Time 
## 3.915015 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_75up
##     Time 
## 3.280791

Age Moving Rate Visualization

slopes_df <- data.frame(
  AgeGroup = names(slopes),
  Slope = unlist(slopes)
)

ggplot(slopes_df, aes(x = AgeGroup, y = Slope, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Slope of Forecasted Trends for Each Age Group",
       x = "Age Group",
       y = "Potential Willingness") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Race Forecasting Model

trends_byrace$DATE <- as.Date(trends_byrace$DATE, format = "%Y-%m-%d")
trends_byrace <- filter(trends_byrace, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-05-31"))
race_groups <- c("COUNT_ADDITIONAL_CUMULATIVE_AIAN", "COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI",
                 "COUNT_ADDITIONAL_CUMULATIVE_BLACK", "COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO",
                 "COUNT_ADDITIONAL_CUMULATIVE_WHITE", "COUNT_ADDITIONAL_CUMULATIVE_OTHER")
trends_long <- melt(trends_byrace, id.vars = "DATE", 
                    measure.vars = race_groups)
names(trends_long)[names(trends_long) == "variable"] <- "Race_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))

for(race_group in race_groups) {
  ts_data <- ts(filter(trends_long, Race_Group == race_group)$Cumulative_Count, 
                start=c(start_year, start_month), frequency=12)
  
  model <- auto.arima(ts_data)
  residuals_data <- residuals(model)

  acf_plot <- Acf(residuals_data, main = paste("ACF of Residuals for", race_group))
  
  # ggsave(paste("ACF_plot_", race_group, ".png", sep = ""), plot = acf_plot, width = 7, height = 5)
  
  print(acf_plot)
}

## 
## Autocorrelations of series 'residuals_data', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.018 -0.047  0.125  0.022 -0.198 -0.048  0.335 -0.107 -0.157  0.020 
##     11     12     13     14     15     16     17     18     19     20     21 
## -0.009 -0.019 -0.020  0.348  0.010 -0.125  0.122  0.086 -0.085  0.030  0.375 
##     22     23     24 
## -0.075 -0.198  0.007

## 
## Autocorrelations of series 'residuals_data', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.010  0.017 -0.016  0.013  0.050 -0.066  0.154 -0.102  0.030 -0.027 
##     11     12     13     14     15     16     17     18     19     20     21 
## -0.033  0.005 -0.102 -0.039  0.017  0.075  0.060 -0.055  0.113  0.034  0.129 
##     22     23     24 
## -0.031  0.010  0.023

## 
## Autocorrelations of series 'residuals_data', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.043  0.037  0.031  0.006  0.145 -0.216  0.136 -0.144  0.018 -0.046 
##     11     12     13     14     15     16     17     18     19     20     21 
## -0.098  0.004 -0.087  0.199 -0.117  0.172 -0.018 -0.056  0.165 -0.093  0.223 
##     22     23     24 
## -0.177  0.031  0.012

## 
## Autocorrelations of series 'residuals_data', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.026 -0.044  0.114  0.058 -0.193 -0.160  0.460 -0.205 -0.183  0.103 
##     11     12     13     14     15     16     17     18     19     20     21 
##  0.007 -0.016 -0.077  0.455 -0.120 -0.025  0.123  0.033 -0.027 -0.078  0.353 
##     22     23     24 
## -0.155 -0.130  0.006

## 
## Autocorrelations of series 'residuals_data', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.011  0.063 -0.152 -0.260  0.044  0.032  0.535 -0.082  0.030 -0.157 
##     11     12     13     14     15     16     17     18     19     20     21 
## -0.277  0.105  0.034  0.456  0.002  0.036 -0.177 -0.203  0.111 -0.010  0.450 
##     22     23     24 
## -0.022 -0.050 -0.154

## 
## Autocorrelations of series 'residuals_data', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.034  0.031  0.015 -0.005  0.129 -0.185  0.160 -0.095 -0.003  0.046 
##     11     12     13     14     15     16     17     18     19     20     21 
## -0.057  0.010 -0.012  0.125 -0.072  0.174 -0.014 -0.030  0.134 -0.024  0.175 
##     22     23     24 
## -0.219  0.053  0.060
start_year = 2021
start_month = 8
latest_data_year = 2022
latest_data_month = 5
future_periods = (2025 - latest_data_year) * 12 + (12 - latest_data_month)

forecasts = list()
errors = list()

for(race_group in race_groups) {
  ts_data <- ts(filter(trends_long, Race_Group == race_group)$Cumulative_Count, 
                start=c(start_year, start_month), frequency=12)
  
  error <- tsCV(ts_data, forecastfunction = function(x) forecast(auto.arima(x), h=1), h=1)
  errors[[race_group]] <- error
  
  model <- auto.arima(ts_data)
  forecast_data <- forecast(model, h=future_periods)
  forecasts[[race_group]] <- forecast_data
}

Race Forecasting Result

for(i in 1:length(forecasts)) {
  race_group <- names(forecasts)[i]
  forecast_data <- forecasts[[i]]

  historical_data <- filter(trends_long, Race_Group == race_group) %>%
                     select(Date = DATE, Count = Cumulative_Count) %>%
                     mutate(Type = "Historical")

  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1), 
                        length.out = length(forecast_data$mean), 
                        by = "month")

  forecast_df <- data.frame(Date = forecast_dates, 
                            Count = forecast_data$mean, 
                            Type = "Forecasted")

  if (all(names(historical_data) == names(forecast_df))) {
    combined_df <- rbind(historical_data, forecast_df)

    p <- ggplot(combined_df, aes(x = Date, y = Count, color = Type)) +
          geom_line() +
          labs(title = paste("Forecast for Race Group", gsub("COUNT_ADDITIONAL_CUMULATIVE_", "", race_group), "(2020 - 2025)"),
               x = "Date",
               y = "Cumulative Count") +
          theme_minimal() +
          theme(legend.title = element_blank())

    print(p)
  } else {
    print(paste("Column mismatch in data frames for race group:", race_group))
  }
}

Race RMSE Result

rmse_results <- list()

for(i in 1:length(forecasts)) {
  race_group <- names(forecasts)[i]
  forecast_data <- forecasts[[i]] 

  historical_data <- filter(trends_long, Race_Group == race_group) %>%
                     select(Date = DATE, Count = Cumulative_Count) %>%
                     mutate(Type = "Historical")

  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1), 
                        length.out = length(forecast_data$mean), 
                        by = "month")

  forecast_df <- data.frame(Date = forecast_dates, 
                            Count = forecast_data$mean, 
                            Type = "Forecasted")

  if (all(names(historical_data) == names(forecast_df))) {
    combined_df <- rbind(historical_data, forecast_df)
    
    end_date_for_actuals <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) - 1
    actual_values <- historical_data$Count[historical_data$Date <= end_date_for_actuals]

    if (length(actual_values) >= length(forecast_data$mean)) {
      actual_values <- tail(actual_values, length(forecast_data$mean))
      forecasted_values <- forecast_data$mean

      rmse <- sqrt(mean((actual_values - forecasted_values)^2, na.rm = TRUE))  # na.rm = TRUE to handle missing values
      rmse_results[[race_group]] <- rmse
    }
  }
}

rmse_results
## $COUNT_ADDITIONAL_CUMULATIVE_AIAN
## [1] 569.5745
## 
## $COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI
## [1] 29400.16
## 
## $COUNT_ADDITIONAL_CUMULATIVE_BLACK
## [1] 26319.53
## 
## $COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO
## [1] 35724.15
## 
## $COUNT_ADDITIONAL_CUMULATIVE_WHITE
## [1] 35481.1
## 
## $COUNT_ADDITIONAL_CUMULATIVE_OTHER
## [1] 4678.765

Age Moving Rate

slopes <- list()

for(i in 1:length(forecasts)) {
  group_name <- names(forecasts)[i]
  forecast_data <- forecasts[[group_name]]
  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1), 
                        length.out = length(forecast_data$mean), 
                        by = "month")
  forecast_df <- data.frame(Date = forecast_dates, 
                            Count = forecast_data$mean)
  forecast_df$Time <- as.numeric(forecast_df$Date - min(forecast_df$Date))
  
  lm_model <- lm(Count ~ Time, data = forecast_df)
  slope <- coef(lm_model)["Time"]
  slopes[[group_name]] <- slope
}

slopes
## $COUNT_ADDITIONAL_CUMULATIVE_AIAN
##      Time 
## 0.2563122 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI
##     Time 
## 12.75541 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_BLACK
##     Time 
## 8.413805 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO
##     Time 
## 13.68999 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_WHITE
##     Time 
## 16.36981 
## 
## $COUNT_ADDITIONAL_CUMULATIVE_OTHER
##     Time 
## 1.964457

Race Moving Rate Visualization

slopes_df <- data.frame(
  RaceGroup = names(slopes),
  Slope = unlist(slopes)
)

ggplot(slopes_df, aes(x = RaceGroup, y = Slope, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Slope of Forecasted Trends for Each Race Group",
       x = "Race Group",
       y = "Slope Value") +  # Adjust the y-axis label if needed
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

coverage_by_demo <- read.csv("people/coverage-by-demo.csv")
head(coverage_by_demo)
##         DATE      POPULATION   GROUP SUBGROUP POP_DENOMINATOR
## 1 2023-09-14 Children (0-17)     Age     '0-4        523718.0
## 2 2023-09-14 Children (0-17)     Age    '5-12        747558.7
## 3 2023-09-14 Children (0-17)     Age   '13-17        432845.5
## 4 2023-09-14 Children (0-17) Borough Citywide       1704122.2
## 5 2023-09-14 Children (0-17) Borough    Bronx        347020.4
## 6 2023-09-14 Children (0-17) Borough Brooklyn        575063.2
##   COUNT_PARTIALLY_CUMULATIVE COUNT_FULLY_CUMULATIVE COUNT_1PLUS_CUMULATIVE
## 1                      23733                  41832                  65565
## 2                      57152                 385370                 442522
## 3                      45273                 361209                 406482
## 4                     126158                 788411                 914569
## 5                      26828                 151948                 178776
## 6                      30057                 221873                 251930
##   COUNT_ADDITIONAL_CUMULATIVE COUNT_BIVALENT_ADDITIONAL_CUMULATIVE
## 1                         212                                18299
## 2                       68382                                55365
## 3                      125097                                38535
## 4                      193691                               112199
## 5                       27368                                16763
## 6                       54083                                32133
##   PERC_PARTIALLY PERC_FULLY PERC_1PLUS PERC_ADDITIONAL PERC_BIVALENT_ADDITIONAL
## 1           4.53       7.99      12.52            0.04                     3.49
## 2           7.65      51.55      59.20            9.15                     7.41
## 3          10.46      83.45      93.91           28.90                     8.90
## 4           7.40      46.26      53.67           11.37                     6.58
## 5           7.73      43.79      51.52            7.89                     4.83
## 6           5.23      38.58      43.81            9.40                     5.59
names(coverage_by_demo)
##  [1] "DATE"                                
##  [2] "POPULATION"                          
##  [3] "GROUP"                               
##  [4] "SUBGROUP"                            
##  [5] "POP_DENOMINATOR"                     
##  [6] "COUNT_PARTIALLY_CUMULATIVE"          
##  [7] "COUNT_FULLY_CUMULATIVE"              
##  [8] "COUNT_1PLUS_CUMULATIVE"              
##  [9] "COUNT_ADDITIONAL_CUMULATIVE"         
## [10] "COUNT_BIVALENT_ADDITIONAL_CUMULATIVE"
## [11] "PERC_PARTIALLY"                      
## [12] "PERC_FULLY"                          
## [13] "PERC_1PLUS"                          
## [14] "PERC_ADDITIONAL"                     
## [15] "PERC_BIVALENT_ADDITIONAL"
new_dataset <- coverage_by_demo %>% select(COUNT_ADDITIONAL_CUMULATIVE, SUBGROUP, GROUP, POP_DENOMINATOR)
head(new_dataset)
##   COUNT_ADDITIONAL_CUMULATIVE SUBGROUP   GROUP POP_DENOMINATOR
## 1                         212     '0-4     Age        523718.0
## 2                       68382    '5-12     Age        747558.7
## 3                      125097   '13-17     Age        432845.5
## 4                      193691 Citywide Borough       1704122.2
## 5                       27368    Bronx Borough        347020.4
## 6                       54083 Brooklyn Borough        575063.2
sorted_dataset <- new_dataset %>% arrange(SUBGROUP)
sorted_dataset
##     COUNT_ADDITIONAL_CUMULATIVE                      SUBGROUP          GROUP
## 1                        193691                         '0-17            Age
## 2                           212                          '0-4            Age
## 3                        125097                        '13-17            Age
## 4                        259784                        '18-24            Age
## 5                        259784                        '18-24            Age
## 6                        544806                        '25-34            Age
## 7                        544806                        '25-34            Age
## 8                        502207                        '35-44            Age
## 9                        502207                        '35-44            Age
## 10                       517086                        '45-54            Age
## 11                       517086                        '45-54            Age
## 12                        68382                         '5-12            Age
## 13                       582544                        '55-64            Age
## 14                       582544                        '55-64            Age
## 15                       465590                        '65-74            Age
## 16                       465590                        '65-74            Age
## 17                       224665                        '75-84            Age
## 18                       224665                        '75-84            Age
## 19                        77835                          '85+            Age
## 20                        77835                          '85+            Age
## 21                        57155                    Asian/NHPI Race/ethnicity
## 22                        57097                    Asian/NHPI Race/ethnicity
## 23                        36721                    Asian/NHPI Race/ethnicity
## 24                       730049                    Asian/NHPI Race/ethnicity
## 25                       787204                    Asian/NHPI Race/ethnicity
## 26                        23527                         Black Race/ethnicity
## 27                        23511                         Black Race/ethnicity
## 28                        16648                         Black Race/ethnicity
## 29                       493271                         Black Race/ethnicity
## 30                       516798                         Black Race/ethnicity
## 31                        27368                         Bronx        Borough
## 32                        27363                         Bronx        Borough
## 33                        19030                         Bronx        Borough
## 34                       423623                         Bronx        Borough
## 35                       450991                         Bronx        Borough
## 36                        54083                      Brooklyn        Borough
## 37                        53995                      Brooklyn        Borough
## 38                        32516                      Brooklyn        Borough
## 39                       865300                      Brooklyn        Borough
## 40                       919383                      Brooklyn        Borough
## 41                       193691                      Citywide        Borough
## 42                       193479                      Citywide        Borough
## 43                       125097                      Citywide        Borough
## 44                      3174517                      Citywide        Borough
## 45                      3368208                      Citywide        Borough
## 46                        99443                        Female            Sex
## 47                        99333                        Female            Sex
## 48                        65584                        Female            Sex
## 49                      1746356                        Female            Sex
## 50                      1845799                        Female            Sex
## 51                        48988               Hispanic/Latino Race/ethnicity
## 52                        48969               Hispanic/Latino Race/ethnicity
## 53                        34352               Hispanic/Latino Race/ethnicity
## 54                       687591               Hispanic/Latino Race/ethnicity
## 55                       736579               Hispanic/Latino Race/ethnicity
## 56                        93893                          Male            Sex
## 57                        93791                          Male            Sex
## 58                        59234                          Male            Sex
## 59                      1413385                          Male            Sex
## 60                      1507278                          Male            Sex
## 61                        48284                     Manhattan        Borough
## 62                        48221                     Manhattan        Borough
## 63                        27337                     Manhattan        Borough
## 64                       777827                     Manhattan        Borough
## 65                       826111                     Manhattan        Borough
## 66                          660                   Multiracial Race/ethnicity
## 67                          656                   Multiracial Race/ethnicity
## 68                           80                   Multiracial Race/ethnicity
## 69                         4695                   Multiracial Race/ethnicity
## 70                         5355                   Multiracial Race/ethnicity
## 71                          951 Native American/Alaska Native Race/ethnicity
## 72                          951 Native American/Alaska Native Race/ethnicity
## 73                          764 Native American/Alaska Native Race/ethnicity
## 74                        12164 Native American/Alaska Native Race/ethnicity
## 75                        13115 Native American/Alaska Native Race/ethnicity
## 76                         3478                         Other Race/ethnicity
## 77                           32                         Other            Sex
## 78                         3476                         Other Race/ethnicity
## 79                           32                         Other            Sex
## 80                         2728                         Other Race/ethnicity
## 81                           27                         Other            Sex
## 82                        91673                         Other Race/ethnicity
## 83                          853                         Other            Sex
## 84                        95151                         Other Race/ethnicity
## 85                          885                         Other            Sex
## 86                          158          Prefer not to answer            Sex
## 87                          158          Prefer not to answer            Sex
## 88                          112          Prefer not to answer            Sex
## 89                         4272          Prefer not to answer            Sex
## 90                         4430          Prefer not to answer            Sex
## 91                        55198                        Queens        Borough
## 92                        55147                        Queens        Borough
## 93                        39604                        Queens        Borough
## 94                       945978                        Queens        Borough
## 95                      1001176                        Queens        Borough
## 96                         8758                 Staten Island        Borough
## 97                         8753                 Staten Island        Borough
## 98                         6610                 Staten Island        Borough
## 99                       161789                 Staten Island        Borough
## 100                      170547                 Staten Island        Borough
## 101                        5892                       Unknown Race/ethnicity
## 102                         165                       Unknown            Sex
## 103                        5886                       Unknown Race/ethnicity
## 104                         165                       Unknown            Sex
## 105                        4161                       Unknown Race/ethnicity
## 106                         140                       Unknown            Sex
## 107                      105777                       Unknown Race/ethnicity
## 108                        9651                       Unknown            Sex
## 109                      111669                       Unknown Race/ethnicity
## 110                        9816                       Unknown            Sex
## 111                       53040                         White Race/ethnicity
## 112                       52933                         White Race/ethnicity
## 113                       29643                         White Race/ethnicity
## 114                     1049297                         White Race/ethnicity
## 115                     1102337                         White Race/ethnicity
##     POP_DENOMINATOR
## 1        1704122.18
## 2         523718.00
## 3         432845.48
## 4         704670.82
## 5         704670.82
## 6        1483699.00
## 7        1483699.00
## 8        1136906.00
## 9        1136906.00
## 10       1028087.00
## 11       1028087.00
## 12        747558.70
## 13        998927.00
## 14        998927.00
## 15        718795.00
## 16        718795.00
## 17        382672.00
## 18        382672.00
## 19        178938.00
## 20        178938.00
## 21        215999.98
## 22        148927.98
## 23         54811.75
## 24       1017642.02
## 25       1233642.00
## 26        373002.78
## 27        266476.78
## 28        104094.26
## 29       1452845.22
## 30       1825848.00
## 31        347020.43
## 32        246821.43
## 33         92110.57
## 34       1071186.57
## 35       1418207.00
## 36        575063.22
## 37        392137.22
## 38        140161.79
## 39       1984839.78
## 40       2559903.00
## 41       1704122.18
## 42       1180404.18
## 43        432845.48
## 44       6632694.82
## 45       8336817.00
## 46        834086.34
## 47        578537.34
## 48        212834.37
## 49       3524291.66
## 50       4358378.00
## 51        599599.62
## 52        423992.62
## 53        155999.08
## 54       1823990.38
## 55       2423590.00
## 56        870035.84
## 57        601866.84
## 58        220011.10
## 59       3108403.16
## 60       3978439.00
## 61        231257.76
## 62        155113.76
## 63         55915.07
## 64       1397448.24
## 65       1628706.00
## 66         56318.27
## 67         35031.27
## 68          9790.69
## 69         96578.73
## 70        152897.00
## 71          3843.07
## 72          3157.07
## 73          1562.20
## 74         15020.93
## 75         18864.00
## 76               NA
## 77               NA
## 78               NA
## 79               NA
## 80               NA
## 81               NA
## 82               NA
## 83               NA
## 84               NA
## 85               NA
## 86               NA
## 87               NA
## 88               NA
## 89               NA
## 90               NA
## 91        447805.83
## 92        310710.83
## 93        114792.04
## 94       1806052.17
## 95       2253858.00
## 96        102974.94
## 97         75620.94
## 98         29866.00
## 99        373168.06
## 100       476143.00
## 101              NA
## 102              NA
## 103              NA
## 104              NA
## 105              NA
## 106              NA
## 107              NA
## 108              NA
## 109              NA
## 110              NA
## 111       455358.46
## 112       302818.46
## 113       106587.51
## 114      2226617.54
## 115      2681976.00
dummy_data <- sorted_dataset %>%
  dummy_cols(c("GROUP", "SUBGROUP")) %>%
  select(-c(GROUP, SUBGROUP))

columns_to_remove <- c(
  "SUBGROUP_Unknown",
  "SUBGROUP_Other",
  "SUBGROUP_Prefer not to answer",
  "POP_DENOMINATOR",
  "SUBGROUP_'0-17"
)

dummy_data <- dummy_data %>%
  select(-c(`SUBGROUP_Unknown`, `SUBGROUP_Other`, `SUBGROUP_Prefer not to answer`,`POP_DENOMINATOR`,`SUBGROUP_'0-17`))
column_name_mapping <- c(
  "GROUP_Race/ethnicity" = "GROUP_Race_ethnicity",
  "SUBGROUP_'0-4" = "Age_0_4",
  "SUBGROUP_'5-12" = "Age_5_12",
  "SUBGROUP_'13-17" = "Age_13_17",
  "SUBGROUP_'18-24" = "Age_18_24",
  "SUBGROUP_'25-34" = "Age_25_34",
  "SUBGROUP_'35-44" = "Age_35_44",
  "SUBGROUP_'45-54" = "Age_45_54",
  "SUBGROUP_'55-64" = "Age_55_64",
  "SUBGROUP_'65-74" = "Age_65_74",
  "SUBGROUP_'75-84" = "Age_75_84",
  "SUBGROUP_'85+" = "Age_85_plus",
  "SUBGROUP_Asian/NHPI" = "Race_Asian_NHPI",
  "SUBGROUP_Hispanic/Latino" = "Race_Hispanic_Latino",
  "SUBGROUP_Native American/Alaska Native" = "Race_Native_American_Alaska_Native",
  "SUBGROUP_Staten Island" = "Race_Staten_Island",
  "SUBGROUP_Black" = "Race_Black",
  "SUBGROUP_White" = "Race_White",
  "SUBGROUP_Multiracial" = "Race_Multiracial",
  "SUBGROUP_Female" ="Gender_F",
  "SUBGROUP_Male" = "Gender_M"
)

dummy_data <- na.omit(dummy_data)
colnames(dummy_data) <- sapply(colnames(dummy_data), function(x) {
  if (x %in% names(column_name_mapping)) {
    return(column_name_mapping[x])
  } else {
    return(x)
  }
})
model <- lm(COUNT_ADDITIONAL_CUMULATIVE ~ ., data = dummy_data)
summary(model)
## 
## Call:
## lm(formula = COUNT_ADDITIONAL_CUMULATIVE ~ ., data = dummy_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1285901  -191232    -1207     7674  1957210 
## 
## Coefficients: (2 not defined because of singularities)
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            2060     132427   0.016  0.98763    
## GROUP_Age                            191631     529707   0.362  0.71840    
## GROUP_Borough                         69232     264854   0.261  0.79440    
## GROUP_Race_ethnicity                  40929     209385   0.195  0.84548    
## GROUP_Sex                                NA         NA      NA       NA    
## Age_0_4                             -193479     725331  -0.267  0.79030    
## Age_5_12                            -125309     725331  -0.173  0.86324    
## Age_13_17                            -68594     725331  -0.095  0.92487    
## Age_18_24                             66093     628155   0.105  0.91645    
## Age_25_34                            351115     628155   0.559  0.57762    
## Age_35_44                            308516     628155   0.491  0.62456    
## Age_45_54                            323395     628155   0.515  0.60798    
## Age_55_64                            388853     628155   0.619  0.53751    
## Age_65_74                            271899     628155   0.433  0.66619    
## Age_75_84                             30974     628155   0.049  0.96079    
## Age_85_plus                         -115856     628155  -0.184  0.85410    
## Race_Asian_NHPI                      290656     280920   1.035  0.30370    
## Race_Black                           171762     280920   0.611  0.54251    
## SUBGROUP_Bronx                       118384     324378   0.365  0.71603    
## SUBGROUP_Brooklyn                    313764     324378   0.967  0.33609    
## SUBGROUP_Citywide                   1339707     324378   4.130  8.3e-05 ***
## Gender_F                             769243     264854   2.904  0.00466 ** 
## Race_Hispanic_Latino                 268307     280920   0.955  0.34217    
## Gender_M                             631456     264854   2.384  0.01929 *  
## SUBGROUP_Manhattan                   274265     324378   0.846  0.40015    
## Race_Multiracial                     -40700     280920  -0.145  0.88514    
## Race_Native_American_Alaska_Native   -37400     280920  -0.133  0.89439    
## SUBGROUP_Queens                      348129     324378   1.073  0.28614    
## Race_Staten_Island                       NA         NA      NA       NA    
## Race_White                           414461     280920   1.475  0.14372    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 512900 on 87 degrees of freedom
## Multiple R-squared:  0.3463, Adjusted R-squared:  0.1434 
## F-statistic: 1.707 on 27 and 87 DF,  p-value: 0.03311
colnames(dummy_data) <- gsub("^SUBGROUP_", "", colnames(dummy_data))

rf_model <- randomForest(COUNT_ADDITIONAL_CUMULATIVE ~ ., data = dummy_data)
importance_data <- as.data.frame(importance(rf_model))
importance_data <- data.frame(
  Feature = row.names(importance_data),
  Importance = importance_data$IncNodePurity
)
importance_data_ordered <- importance_data[order(-importance_data$Importance), ]
print(importance_data)
##                               Feature   Importance
## 1                           GROUP_Age 2.546310e+11
## 2                       GROUP_Borough 6.857794e+11
## 3                GROUP_Race_ethnicity 3.457317e+11
## 4                           GROUP_Sex 4.254633e+11
## 5                             Age_0_4 5.547221e+10
## 6                            Age_5_12 3.569457e+10
## 7                           Age_13_17 1.703853e+10
## 8                           Age_18_24 1.625324e+10
## 9                           Age_25_34 1.304611e+11
## 10                          Age_35_44 1.052718e+11
## 11                          Age_45_54 1.076199e+11
## 12                          Age_55_64 1.703212e+11
## 13                          Age_65_74 7.106815e+10
## 14                          Age_75_84 1.851230e+10
## 15                        Age_85_plus 7.287943e+10
## 16                    Race_Asian_NHPI 2.125262e+11
## 17                         Race_Black 7.130117e+10
## 18                              Bronx 1.358951e+11
## 19                           Brooklyn 2.459527e+11
## 20                           Citywide 6.598033e+12
## 21                           Gender_F 1.849997e+12
## 22               Race_Hispanic_Latino 1.865358e+11
## 23                           Gender_M 1.089288e+12
## 24                          Manhattan 1.559977e+11
## 25                   Race_Multiracial 1.302214e+11
## 26 Race_Native_American_Alaska_Native 1.142840e+11
## 27                             Queens 2.761417e+11
## 28                 Race_Staten_Island 2.374544e+11
## 29                         Race_White 5.574445e+11
print(importance_data_ordered)
##                               Feature   Importance
## 20                           Citywide 6.598033e+12
## 21                           Gender_F 1.849997e+12
## 23                           Gender_M 1.089288e+12
## 2                       GROUP_Borough 6.857794e+11
## 29                         Race_White 5.574445e+11
## 4                           GROUP_Sex 4.254633e+11
## 3                GROUP_Race_ethnicity 3.457317e+11
## 27                             Queens 2.761417e+11
## 1                           GROUP_Age 2.546310e+11
## 19                           Brooklyn 2.459527e+11
## 28                 Race_Staten_Island 2.374544e+11
## 16                    Race_Asian_NHPI 2.125262e+11
## 22               Race_Hispanic_Latino 1.865358e+11
## 12                          Age_55_64 1.703212e+11
## 24                          Manhattan 1.559977e+11
## 18                              Bronx 1.358951e+11
## 9                           Age_25_34 1.304611e+11
## 25                   Race_Multiracial 1.302214e+11
## 26 Race_Native_American_Alaska_Native 1.142840e+11
## 11                          Age_45_54 1.076199e+11
## 10                          Age_35_44 1.052718e+11
## 15                        Age_85_plus 7.287943e+10
## 17                         Race_Black 7.130117e+10
## 13                          Age_65_74 7.106815e+10
## 5                             Age_0_4 5.547221e+10
## 6                            Age_5_12 3.569457e+10
## 14                          Age_75_84 1.851230e+10
## 7                           Age_13_17 1.703853e+10
## 8                           Age_18_24 1.625324e+10
age_colors <- c("Age_0_4" = "red","Age_5_12" = "red", 
                "Age_13_17" = "red", "Age_18_24" = "red", "Age_25_34" = "red", 
                "Age_35_44" = "red", "Age_45_54" = "red", "Age_55_64" = "red", 
                "Age_65_74" = "red", "Age_75_84" = "red", "Age_85_plus" = "red")

race_colors <- c("Race_Asian_NHPI" = "green", "Race_Black" = "green", 
                 "Race_Multiracial" = "green", "Race_Native_American_Alaska_Native" = "green", 
                 "Race_Hispanic_Latino" = "green", "Race_White" = "green","Race_Staten_Island" = "green")

borough_colors <- c("Bronx" = "orange", "Brooklyn" = "orange", 
                    "Citywide" = "orange", "Manhattan" = "orange", "Queens" = "orange")

gender_colors <- c("Gender_F" = "pink", "Gender_M" = "pink")



ggplot(importance_data, aes(x = Feature, y = Importance, fill = Feature)) +
  geom_bar(stat = "identity", width = 0.7) +
  labs(x = "Feature", y = "Importance") +
  ggtitle("Feature Importance with Booster") +
  scale_fill_manual(values = c(age_colors, race_colors, borough_colors, gender_colors)) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = 4, vjust = 0.5)) + 
  theme(axis.text.y  = element_text(angle = 0, hjust = 0.5, size = 4, vjust = 0.5))+
  theme(plot.title = element_text(hjust = 0.2, size = 16, face = "bold")) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
  coord_flip() + 
  scale_y_continuous(labels = scales::comma) +
  theme(
    plot.background = element_rect(fill = "white"), 
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold"),
    legend.position = "bottom"  
  ) +
  labs(fill = "Feature") +  
  theme(
    legend.text = element_text(size = 4), 
    legend.title = element_text(size = 4, face = "bold")  
  ) +
  theme(legend.box = "horizontal")  

#install.packages(c("tm", "stringr", "textclean"))
#install.packages("wordcloud")
#install.packages("plotly")
#install.packages("textdata")
#install.packages("afinn")
#install.packages("tidytext")
#install.packages("treemap")
library(readr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(stringr)
library(textclean)
library(readr)
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(dplyr)
library(treemap)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
covid <- utils::read.csv("train.csv", sep = "|", stringsAsFactors = FALSE)
colnames(covid)
## [1] "Text"  "Label"
age_pattern <- "\\b(18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35)\\b|\\b(18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35) years? old\\b|\\b(18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35) y/o\\b|\\b(18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35) yo\\b"
#age_pattern <- "\\bage\\b|\\byears? old\\b|\\by/o\\b|\\byo\\b|aged"
covid_pattern <- "\\bcovid\\b|\\bcoronavirus\\b"
vaccine_pattern <- "\\bvaccine\\b|\\bvaccination\\b|\\bimmunization\\b"
race_pattern <- "\\bWhite\\b|\\bBlack\\b|\\bAfrican American\\b|\\bAsian\\b|\\bHispanic\\b|\\bLatino\\b"
usa_pattern <- "\\bUnited States\\b|\\bUSA\\b|\\bAmerica\\b|\\bAmerican\\b"
filtered_data <- covid[with(covid, str_detect(Text, age_pattern) | 
                                     str_detect(Text, covid_pattern) | 
                                     str_detect(Text, vaccine_pattern) | 
                                     str_detect(Text, race_pattern) | 
                                     str_detect(Text, usa_pattern)), ]

head(filtered_data)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Text
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 I do agree with you but I also think USA has a few differences that will make things worse here1 general poor health if people Many comorbidities like hbp diabetes coronary disease are extremely common and considered normal for anyone over 402 number of hospital beds and especially icu beds per capital are much lower than sk3 access to healthcare is much more difficult here than in sk  Many people will not be able to afford it or afford time off work etc making them avoid treatment until its to late or until theyve continued spreading4 we simply arent testing which increases community spread This wont effect things in a percentage way but will increase infected and death totals
## 47                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        I dont know how you got to it you seem a little confused about it They are referring to this studyhttpswwwpharmaceuticaltechnologycomnewscoronavirusincubationperiod27days which is the only one to come up with 27 days for a possible incubation period All of the other studies done have concluded that the incubation period is 14 days or less 27 is the outlier
## 64                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 2 of 50k is 10001000 restraunts employing lets say 12 people each thats 12k 12k people lets assume they aall get min wage thats 12k72587k an hour Thats 348 million a week for however many weeks Because each of the 12 only works say 40 hours a week 87k40The ceo of Yum made 22 million in 2013 sorry most recent I could find 348m is 158 of 22m Theyre paying 15 a week of the ceos salary for that Thats not exactly chump change For perspective thats over 5 what god demands be given to the church
## 102 For those who cant see the post I copied it in two parts in the comments below see Part 2 in another commenthttpswwwredditcomrCoronaviruscommentsffa2tftestimonyofasurgeonworkinginbergamointhehttpswwwredditcomrCoronaviruscommentsffa2tftestimonyofasurgeonworkinginbergamointhePART 1throwaway1Dgrzqhttpswwwredditcomuserthrowaway1Dgrzq 515 points15 hours agohttpswwwredditcomrCoronaviruscommentsffa2tftestimonyofasurgeonworkinginbergamointhefjx5tc8edited 14 hours agolate edit I need to add a disclaimer dont go to the linked thread if you have any sort of anxiety issues that other sub is extremely grim and not supportive of mental health I apologize for the late warningEnglish translation by mcgianni on rmedicinehttpsnpredditcomrmedicinecommentsff8hnstestimonyofasurgeonworkinginbergamointhe NPlinked please do not participate if you are a nonmedical professionalIn one of the nonstop emails that I receive from my hospital administration on a more than daily basis there was a paragraph on how to be responsible on social media with some recommendations that we all can agree on After thinking for a long time if and what to write about whats happening here I felt that silence was not responsible I will therefore try to convey to laypeople those who are more distant from our reality what we are experiencing in Bergamo during these Covid19 pandemic days I understand the need not to panic but when the message of the danger of what is happening is not out and I still see people ignoring the recommendations and people who gather together complaining that they cannot go to the gym or play soccer tournaments I shiver I also understand the economic damage and I am also worried about that After this epidemic it will be hard to start over    Still beside the fact that we are also devastating our national health system from an economic point of view I want to point out that the public health damage that is going to invest the country is more important and I find it nothing short of chilling that new quarantine areas requested by the Region has not yet been established for the municipalities of Alzano Lombardo and Nembro I would like to clarify that this is purely personal opinion I myself looked with some amazement at the reorganization of the entire hospital in the previous week when our current enemy was still in the shadows the wards slowly emptied elective activities interrupted intensive care unit freed to create as many beds as possible Containers arriving in front of the emergency room to create diversified routes and avoid infections All this rapid transformation brought in the hallways of the hospital an atmosphere of surreal silence and emptiness that we did not understand waiting for a war that had yet to begin and that many including me were not so sure would never come with such ferocity I open a parenthesis all this was done in the shadows and without publicity while several newspapers had the courage to say that private health care was not doing anything    I still remember my night shift a week ago spent without any rest waiting for a call from the microbiology department I was waiting for the results of a swab taken from the first suspect case in our hospital thinking about what consequences it would have for us and the hospital If I think about it my agitation for one possible case seems almost ridiculous and unjustified now that I have seen what is happening Well the situation is now nothing short of dramatic No other words come to mind The war has literally exploded and battles are uninterrupted day and night One after the other these unfortunate people come to the emergency room They have far from the complications of a flu Lets stop saying its a bad flu In my two years working in Bergamo I have learned that the people here do not come to the emergency room for no reason They did well this time too They followed all the recommendations given a week or ten days at home with a fever without going out to prevent contagion but now they cant take it anymore They dont breathe enough they need oxygen Drug therapies for this virus are few    The course mainly depends on our organism We can only support it when it cant take it anymore It is mainly hoped that our body will eradicate the virus on its own lets face it Antiviral therapies are experimental on this virus and we learn its behavior day after day Staying at home until the symptoms worsen does not change the prognosis of the disease Now however that need for beds in all its drama has arrived One after another the departments that had been emptied are filling up at an impressive rate The display boards with the names of the sicks of different colors depending on the department they belong to are now all red and instead of the surgical procedure there is the diagnosis which is always the same bilateral interstitial pneumonia Now tell me which flu virus causes such a rapid tragedy    Because thats the difference now I get a little technical in classical flu besides that it infects much less population over several months cases are complicated less frequently only when the virus has destroyed the protective barriers of our airways and as such it allows bacteria which normally resident in the upper airways to invade the bronchi and lungs causing a more serious disease Covid 19 causes a banal flu in many young people but in many elderly people and not only a real SARS because it invades the alveoli of the lungs directly and it infects them making them unable to perform their function The resulting respiratory failure is often serious and after a few days of hospitalization the simple oxygen that can be administered in a ward may not be enough Sorry but to me as a doctor its not reassuring that the most serious are mainly elderly people with other pathologies The elderly population is the most represented in our country and it is difficult to find someone who above 65 years of age does not take at least a pill for high blood pressure or diabetes    I can also assure you that when you see young people who end up intubated in the ICU pronated or worse in ECMO a machine for the worst cases which extracts the blood reoxygenates it and returns it to the body waiting for the lungs to hopefully heal all this confidence for your young age goes away And while there are still people on social media who boast of not being afraid by ignoring the recommendations protesting that their normal lifestyle habits have temporarily halted the epidemiological disaster is taking place And there are no more surgeons urologists orthopedists we are only doctors who suddenly become part of a single team to face this tsunami that has overwhelmed us12See my other comment for PART 2
## 104                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              It was just a stupid snobby thing to say by the elites and the expertsIts not a coincidence that East Asian countries have had so much success When  there is so much asymptomatic transmission its better of if we tell people to wear it not that its useless
## 132                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Government confirmation    press release   01282020 No 12  GP Download PDFhttpswwwstmgpbayerndepressedreiweiterecoronavirusfaelleinbayernzusammenhangmitdemerstenfallbayernsoutputpdfThree more coronavirus cases in Bavaria  connection with the  first case  Bavarias Minister of Health Huml around 40 people should  be tested on Wednesday as a precaution  The Bavarian Ministry  of Health was informed on Tuesday evening that three other people in  Bavaria had been infected with the novel corona virus These patients  are also employees of the company from the district of Starnberg where  the first person affected is already employed This man had apparently  contracted a Chinese colleague during a training session on January 21  This colleague flew back to China on January 23 On January 27 the  company informed the health department of the Chinese womans illness   It was decided that the three new patients should also be admitted to  the Munich Clinic Schwabing where they would be medically monitored and  isolated A few other contact persons are currently testing whether  they are infected with the corona virus The Bavarian Ministry of Health  and the State Office for Health and Food Safety LGL will report on  details in a press release on Wednesday   Bavarias Minister of Health Melanie Huml emphasized on Tuesday  evening in Munich A total of around 40 employees of the company were  identified who could be considered as close contacts Those affected  should be tested on Wednesday as a precautionary measure They will also  be examined by the LGLs Task Force Infectiology questioned in  detail 
##     Label
## 3       0
## 47      0
## 64      1
## 102     0
## 104     1
## 132     0
corpus <- Corpus(VectorSource(filtered_data$Text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
tokenize <- function(x) {
  unlist(str_split(x, pattern = " "))
}
corpus <- tm_map(corpus, content_transformer(tokenize))
sentiment_data <- data.frame(text = sapply(corpus, as.character))
text_combined <- paste(unlist(corpus), collapse = " ")
wordcloud_obj <- wordcloud(text_combined, max.words = 100, scale = c(5, 0.8), colors = brewer.pal(8, "Dark2"))

sentiment_data <- sentiment_data %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(word) %>%
  summarise(sentiment_score = sum(value))
## Joining with `by = join_by(word)`
positive_words <- sentiment_data %>%
  filter(sentiment_score >= 0.5) %>%
  top_n(20, wt = sentiment_score)

negative_words <- sentiment_data %>%
  filter(sentiment_score <= 0.5) %>%
  top_n(20, wt = sentiment_score)

head(negative_words)
## # A tibble: 6 × 2
##   word       sentiment_score
##   <chr>                <dbl>
## 1 apologise               -1
## 2 apologized              -1
## 3 awaits                  -1
## 4 axe                     -1
## 5 cancels                 -1
## 6 clouded                 -1
head(positive_words)
## # A tibble: 6 × 2
##   word    sentiment_score
##   <chr>             <dbl>
## 1 best                918
## 2 better              990
## 3 care               1092
## 4 focused             352
## 5 good               1953
## 6 great               645
positive_words <- positive_words[order(-positive_words$sentiment_score), ]
positive_words
## # A tibble: 20 × 2
##    word      sentiment_score
##    <chr>               <dbl>
##  1 like                 3968
##  2 good                 1953
##  3 care                 1092
##  4 better                990
##  5 best                  918
##  6 help                  694
##  7 positive              652
##  8 great                 645
##  9 please                581
## 10 hope                  532
## 11 support               516
## 12 want                  478
## 13 healthy               390
## 14 true                  390
## 15 united                365
## 16 welcome               356
## 17 focused               352
## 18 lol                   351
## 19 important             332
## 20 novel                 322
fig <- plot_ly(
  labels = positive_words$word,
  parents = "",
  values = positive_words$sentiment_score,
  type = "treemap",
  path = ~word,
  ids = ~word,
  textinfo = "label+value"
)

fig <- fig %>% layout(
  title = "20 Most Positive Words, Age 15-34, All Race",
  font = list(size = 14),
  margin = list(l = 0, r = 0, b = 0, t = 40)
)

fig
fig <- plot_ly(
  labels = negative_words$word,
  parents = "",
  values = abs(negative_words$sentiment_score), 
  type = "treemap"
)

fig <- fig %>% layout(
  title = "20 Most Negative Words",
  font = list(size = 14),
  margin = list(l = 0, r = 0, b = 0, t = 40)
)

fig
total_positive_score <- sum(positive_words$sentiment_score)
total_negative_score <- sum(negative_words$sentiment_score)

threshold_value <- 5

if (total_positive_score - total_negative_score > threshold_value) {
  overall_sentiment <- "Positive"
} else if (total_positive_score - total_negative_score < threshold_value) {
  overall_sentiment <- "Negative"
} else {
  overall_sentiment <- "Neutral"
}

cat("Overall Sentiment:", overall_sentiment, "\n")
## Overall Sentiment: Positive